"""
   ┌─┐       ┌─┐ + +
┌──┘ ┴───────┘ ┴──┐++
│                 │
│       ───       │++ + + +
███████───███████ │+
│                 │+
│       ─┴─       │
│                 │
└───┐         ┌───┘
    │         │
    │         │   + +
    │         │
    │         └──────────────┐
    │                        │
    │                        ├─┐
    │                        ┌─┘
    │                        │
    └─┐  ┐  ┌───────┬──┐  ┌──┘  + + + +
      │ ─┤ ─┤       │ ─┤ ─┤
      └──┴──┘       └──┴──┘  + + + +
             神兽保佑
            代码无BUG!
"""
import time

import pandas as pd

import re


class ReadTag:
    """Load rows from an Excel workbook, extract ``#...#`` tags, and export them.

    Input rows are read positionally: column 0 is the id, column 1 the image
    URL, column 2 the text content, column 3 the existing tags and column 4
    the image count.  The exported sheet carries those values plus a
    ``sina_tag`` column holding the tags found in the content.

    Args:
        input_path: Workbook to read in :meth:`read_file`.
        output_path: Workbook written by :meth:`process_file`.
        sheet_name: Name of the sheet written by :meth:`process_file`.
    """

    # Column order of the exported sheet.
    COLUMNS = ["id", "img_url", "content", "tags", "img_number", "sina_tag"]

    # Lazily captures the text enclosed by a pair of '#' characters.
    _TAG_PATTERN = re.compile(r"#(.*?)#")

    def __init__(self, input_path="./file/new_weibo_1.xlsx", output_path="微博.xlsx",
                 sheet_name="微博内容"):
        self.input_path = input_path
        self.output_path = output_path
        self.sheet_name = sheet_name
        self.data_Frame = pd.DataFrame(columns=self.COLUMNS)
        self.weibo_data = []

    def read_file(self) -> None:
        """Append every row of the input workbook to ``self.weibo_data``.

        Prints the total number of rows collected so far.
        """
        file_data = pd.read_excel(self.input_path)
        # Iterating a 2-D array yields one row array per record.
        self.weibo_data.extend(file_data.values)
        print(len(self.weibo_data))

    @classmethod
    def _extract_tags(cls, content) -> str:
        """Return the distinct ``#...#`` tags in *content*, comma-joined.

        Tags keep their first-appearance order so the output is reproducible.
        Non-string content (e.g. NaN from an empty cell) yields ``""``.
        """
        if not isinstance(content, str):
            return ""
        # dict.fromkeys de-duplicates while preserving insertion order.
        return ",".join(dict.fromkeys(cls._TAG_PATTERN.findall(content)))

    def _build_frame(self):
        """Build a DataFrame from ``self.weibo_data`` without touching disk.

        Rows whose id cell is missing are skipped.  Prints the id and the
        extracted tags of every row that is kept.
        """
        rows = []
        for data in self.weibo_data:
            if pd.isna(data[0]):
                continue
            # Ids may arrive as floats ("123.0"); keep only the integer part.
            id_ = str(data[0]).split(".")[0]
            sina_tag = self._extract_tags(data[2])
            rows.append({
                "id": id_,
                "img_url": data[1],
                "content": data[2],
                "tags": data[3],
                "img_number": data[4],
                "sina_tag": sina_tag,
            })
            print(id_, sina_tag)
        # Building the frame once avoids the per-row copy of repeated appends
        # (and DataFrame.append no longer exists in pandas >= 2.0).
        return pd.DataFrame(rows, columns=self.COLUMNS)

    def process_file(self) -> None:
        """Convert the loaded rows and write the result to ``output_path``.

        New rows are added to whatever ``self.data_Frame`` already holds, so
        repeated calls keep accumulating as before.
        """
        new_rows = self._build_frame()
        if self.data_Frame.empty:
            self.data_Frame = new_rows
        elif not new_rows.empty:
            self.data_Frame = pd.concat([self.data_Frame, new_rows], ignore_index=True)
        # sheet_name must be passed by keyword on current pandas versions.
        self.data_Frame.to_excel(self.output_path, sheet_name=self.sheet_name, index=False)

    def run(self) -> None:
        """Read the input workbook, then process and export it."""
        self.read_file()
        self.process_file()


if __name__ == '__main__':
    # Script entry point: build the reader and run the full read/process pipeline.
    ReadTag().run()
